import os
import re

import requests

import util

logger = util.logger('static/豆瓣电影')

# Seconds to wait on each request. requests applies no timeout by default,
# so without this a single stalled connection would hang the crawl forever.
REQUEST_TIMEOUT = 10

# Directory that receives the downloaded posters.
IMG_DIR = 'static/豆瓣电影/img'

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36',
}

# One match per movie on a list page: (detail URL, title, poster URL).
ITEM_PATTERN = re.compile(
    r'<div class="pic">.*?<a href="(.*?)">.*?<img width="100" alt="(.*?)" src="(.*?)">.*?</a>.*?</div>',
    re.S,
)

# Synopsis markup, tried in order: the "all hidden" span first, then the
# "v:summary" span as a fallback.
SUMMARY_PATTERNS = (
    re.compile('<span class="all hidden">(.*?)</span>', re.S),
    re.compile('<span property="v:summary">(.*?)</span>', re.S),
)

# Characters that cannot appear in a file name on Windows; '/' is also a
# path separator everywhere else.
UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')


def _fetch(session, url):
    """Return the response for a GET of *url*.

    Raises:
        requests.RequestException: on a network error, a timeout, or a
            4xx/5xx status code.
    """
    response = session.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response


def _extract_summary(html):
    """Return the synopsis found in *html* as a single line, or None.

    The captured text is split on ``<br />`` and each piece is stripped
    before the pieces are joined back together without a separator.
    """
    for pattern in SUMMARY_PATTERNS:
        match = pattern.search(html)
        if match:
            pieces = match.group(1).strip().split('<br />')
            return ''.join(piece.strip() for piece in pieces)
    return None


# The poster directory must exist before the first image is written.
os.makedirs(IMG_DIR, exist_ok=True)

# A Session reuses connections (and keeps cookies) across the requests
# instead of opening a new connection for every page, detail and image.
with requests.Session() as session:
    session.headers.update(headers)

    for i in range(10):
        logger.info(f'开爬第{i}页....')
        url = f'https://movie.douban.com/top250?start={i * 25}&filter='
        try:
            list_html = _fetch(session, url).text
        except requests.RequestException as exc:
            # A failed list page should not abort the remaining pages.
            logger.info(f'第{i}页请求失败，跳过: {exc}')
            continue

        for detail_url, title, img_url in ITEM_PATTERN.findall(list_html):
            try:
                logger.info(f'开爬{title}简介')
                detail_html = _fetch(session, detail_url).text
                logger.info(f'开爬{title}海报')
                poster = _fetch(session, img_url).content
            except requests.RequestException as exc:
                # Skip only this movie; an error body must never be saved
                # as if it were the poster image.
                logger.info(f'{title}请求失败，跳过: {exc}')
                continue

            summary = _extract_summary(detail_html)
            if summary is None:
                # Neither pattern matched; record the title with an empty
                # synopsis rather than crashing on a missing match.
                logger.info(f'{title}未找到简介')
                summary = ''

            with open('static/豆瓣电影/detail.txt', 'a', encoding='utf8') as f:
                f.write(title + '\t\t' + summary + '\n')

            # Titles may contain characters that are illegal in file names.
            safe_title = UNSAFE_FILENAME_CHARS.sub('_', title)
            with open(f'{IMG_DIR}/{safe_title}.jpg', 'wb') as f:
                f.write(poster)

        logger.info(f'第{i}页爬取结束....')

logger.info('over....')
